%% This is file `elsarticle-template-3-num.tex',
%%
%% Copyright 2009 Elsevier Ltd
%%
%% This file is part of the 'Elsarticle Bundle'.
%% ---------------------------------------------
%%
%% It may be distributed under the conditions of the LaTeX Project
%% Public License, either version 1.2 of this license or (at your
%% option) any later version.  The latest version of this license is
%% in http://www.latex-project.org/lppl.txt and version 1.2 or later
%% is part of all distributions of LaTeX version 1999/12/01 or later.
%%
%% The list of all files belonging to the 'Elsarticle Bundle' is given
%% in the file `manifest.txt'.
%%
%% Template article for Elsevier's document class `elsarticle' with
%% numbered style bibliographic references
%%
%% $Id: elsarticle-template-3-num.tex 165 2009-10-08 07:58:10Z rishi $
%% $URL:
%% http://lenova.river-valley.com/svn/elsbst/trunk/elsarticle-template-3-num.tex
%% $
%%

% SUBMISSION: USE PREPRINT, FOR AN IDEA OF FINAL FORMAT USE FINAL
\documentclass[preprint,12pt]{elsarticle}
% \documentclass[final,3p,times,twocolumn]{elsarticle}

%% Use the option review to obtain double line spacing
%% \documentclass[preprint,review,12pt]{elsarticle}

%% Use the options 1p,twocolumn; 3p; 3p,twocolumn; 5p; or 5p,twocolumn
%% for a journal layout: \documentclass[final,1p,times]{elsarticle}
%% \documentclass[final,1p,times,twocolumn]{elsarticle}
%% \documentclass[final,3p,times]{elsarticle}
%% \documentclass[final,3p,times,twocolumn]{elsarticle}
%% \documentclass[final,5p,times]{elsarticle}
%% \documentclass[final,5p,times,twocolumn]{elsarticle}

%% if you use PostScript figures in your article use the graphics
%% package for simple commands \usepackage{graphics} or use the
%% graphicx package for more complicated commands
%% \usepackage{graphicx} or use the epsfig package if you prefer to
%% use the old commands \usepackage{epsfig}

%% The amssymb package provides various useful mathematical symbols
\usepackage{tikz-dependency}
\usepackage{amssymb}
%% The amsthm package provides extended theorem environments
%% \usepackage{amsthm}

%% The numcompress package shortens the last page in references.
%% `nodots' option removes dots from firstnames in references.
\usepackage[nodots]{numcompress}

%% The lineno package adds line numbers. Start line numbering with
%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on
%% for the whole article with \linenumbers after \end{frontmatter}.
%% \usepackage{lineno}

\usepackage{subfig} \usepackage{linguex} \usepackage{color}
\usepackage{comment} \usepackage{multirow}
\usepackage{xypic}
%% natbib.sty is loaded by default. However, natbib options can be
%% provided with \biboptions{...} command. Following options are
%% valid:

%% round - round parentheses are used (default) square - square
%% brackets are used [option] curly - curly braces are used {option}
%% angle - angle brackets are used <option> semicolon - multiple
%% citations separated by semi-colon colon - same as semicolon, an
%% earlier confusion comma - separated by comma numbers- selects
%% numerical citations super - numerical citations as superscripts
%% sort - sorts multiple citations according to order in ref. list
%% sort&compress - like sort, but also compresses numerical citations
%% compress - compresses without sorting
%%
%% \biboptions{comma,round}

% \biboptions{}


\journal{Journal of Biomedical Informatics}

\hyphenation{bio-lexi-con}

\begin{document}

\begin{frontmatter}

  %% Title, authors and addresses

  %% use the tnoteref command within \title for footnotes; use the
  %% tnotetext command for the associated footnote; use the fnref
  %% command within \author or \address for footnotes; use the fntext
  %% command for the associated footnote; use the corref command
  %% within \author for corresponding author footnotes; use the
  %% cortext command for the associated footnote; use the ead command
  %% for the email address, and the form \ead[url] for the home page:
  %% 
  %% \title{Title\tnoteref{label1}} \tnotetext[label1]{}
  %% \author{Name\corref{cor1}\fnref{label2}} \ead{email address}
  %% \ead[url]{home page} \fntext[label2]{} \cortext[cor1]{}
  %% \address{Address\fnref{label3}} \fntext[label3]{}

  \title{Approaches to Verb Subcategorization for
    Biomedicine}

  %% use optional labels to link authors explicitly to addresses:
  %% \author[label1,label2]{<author name>} \address[label1]{<address>}
  %% \address[label2]{<address>}

  \author[cam]{Thomas Lippincott} \ead{Thomas.Lippincott@cl.cam.ac.uk}

  \author[cam]{Laura Rimell} \ead{Laura.Rimell@cl.cam.ac.uk}

  \author[aus]{Karin Verspoor} \ead{karin.verspoor@nicta.com.au}

  %\author[col]{Helen L. Johnson} \ead{helen.johnson@ucdenver.edu}

  \author[cam]{Anna Korhonen} \ead{Anna.Korhonen@cl.cam.ac.uk}


  \address[cam]{Computer Laboratory, University of Cambridge, 15 JJ Thomson Avenue, Cambridge CB3 0FD, UK}
  \address[aus]{National ICT Australia, Victoria Research Lab, Melbourne VIC 3010, Australia}
  %\address[col]{Department of Pharmacology, Center for Computational Pharmacology, University of Colorado School of Medicine, Aurora, Colorado, Denver, CO, USA} 

  \begin{abstract}
    %% Text of abstract

    Information about verb subcategorization frames (SCFs) is
    important to many tasks in natural language processing (NLP).
    Biomedicine has a need for high-quality SCF lexicons
    to support the extraction of information from the biomedical literature, which in turn helps biologists to take advantage of the latest biomedical knowledge despite the overwhelming growth of that literature.  Unfortunately,
    techniques for creating such resources for biomedical text are
    relatively undeveloped compared to general language.  This paper
    serves as an introduction to subcategorization and existing
    approaches to acquisition, and provides motivation for developing techniques that
    address issues particularly important to biomedical NLP.  First,
    we give the traditional linguistic definition of
    subcategorization, along with several related concepts.  Second,
    we describe approaches to learning SCF lexicons from
    large data sets for general and biomedical domains.  Third, we consider the crucial issue of
    linguistic variation between biomedical fields (subdomain
    variation).  We demonstrate significant variation among subdomains, and find the variation does not simply follow patterns of general lexical variation.  Finally, we note several requirements for future
    research: a high-quality gold standard, investigation of different
    definitions of subcategorization, and minimally-supervised
    methods that can learn subdomain-specific behaviors without the
    need for extensive manual work.
  \end{abstract}

\begin{keyword}
  verb subcategorization \sep lexical resources \sep natural language
  processing \sep biomedical text processing

  %% keywords here, in the form: keyword \sep keyword

  %% MSC codes here, in the form: \MSC code \sep code or \MSC[2008]
  %% code \sep code (2000 is the default)

\end{keyword}

\end{frontmatter}

%%
%% Start line numbering here if you want
%%
% \linenumbers

%% main text
\section{Introduction}
\label{intro}

Natural language processing (NLP) has an ever-increasing importance in
biomedical informatics and systems biology due to the double-exponential growth in
research publications \citep{Hunter:2006,harmston:2010}.  NLP is essential for
managing vast amounts of unstructured text, and facilitates access to
information and data extraction that would be intractable as a manual
task.  A number of core NLP technologies used in biomedical
informatics could benefit from knowledge of {\it verb
  subcategorization}, i.e.~the tendency of verbs to ``select'' the
syntactic phrase types they co-occur with: for example, the fact that
the verb {\it decrease} can be intransitive ({\it The contribution
  decreased}), while {\it compare} cannot ({\it We compared the
  predictions}, but not simply {\it We compared}). Technologies such
as syntactic and semantic parsing, event identification, relation
extraction, and entailment detection all have the potential to make
use of subcategorization information.  For example,
\citet{ananiadou:10,rupp:10} used {\it subcategorization frames}
(SCFs) in event extraction from UK PubMed Central documents.

Manually constructing subcategorization resources is an expensive and time-consuming task, and may fail when applied to new language domains.  It is therefore important to explore data-driven approaches that require less supervision and can be rapidly deployed for arbitrary text.  While automatic subcategorization acquisition techniques are relatively well-developed for
general English text, and several SCF lexicons have been produced \citep{korhonen:02,valex,preiss:07}, there are
few comparable resources for biomedicine.  Studies of the lexical
characteristics of text -- for example, word and part-of-speech
frequencies -- have shown substantial variation, both between general
and biomedical text and across subdomains of biomedicine
\citep{Verspoor:EtAl:09,Lippincott:2011}. It has not been determined
how much variation exists in subcategorization behavior, or whether
this variation follows the same patterns as lexical variation.

This paper has two goals.  The first is to provide the necessary background 
for future work on SCF acquisition in biomedical NLP.  To this end we 
present the traditional definition of subcategorization, and describe 
the typical state-of-the-art approach to SCF acquisition, with examples
from general and biomedical language.  The second goal is to determine 
the degree of variation in SCF behavior within biomedicine, which could 
have major implications for the success of the approach.  

% This points to a need for technology which can be used to
% automatically acquire lexical resources from texts, since it is
% impractical to develop lexicons manually for each
% subdomain. Automatic methods also facilitate the gathering of
% statistical information on the frequency of lexical items and
% linguistic contexts, which is essential for current NLP systems.

% So far, however, fully automatic methods are more common for nouns
% than verbs in biomedicine \citep{yu:02,mccrae:08,widdows:06},
% despite the fact that verbs are central to recovering the meaning
% and structure of sentences, and to discovering relations between
% biomedical entities.  A small number of resources that have been
% built to support NLP in biomedicine do contain some verb SCF
% information, including BioFrameNet \citep{dolbey:06} and the UMLS
% SPECIALIST Lexicon \citep{mccray:94}, but the SCF information is
% manually constructed.  The BioLexicon \citep{biolexicon,sasaki:08}
% is currently the only resource containing an automatically
% constructed SCF lexicon. However, the BioLexicon includes data from
% the E. Coli subdomain alone, and each component used in acquisition
% of the lexicon -- for example, the part-of-speech tagger, named
% entity recognizer, and parser -- has been manually adapted to the
% subdomain of molecular biology. Moreover, no evaluation is provided
% with this lexicon which would reveal how well the acquisition
% technology performs on E.Coli or on general biomedical corpus data.

% Before the field can develop state-of-the-art SCF acquisition
% methods for biomedicine, it is necessary to investigate a number of
% topics which will help define what such a system should look
% like. In this paper we provide the background and motivation for
% research

% subcategorization frames and frequencies, using data from across the
% PubMed Open Access collection (PMC OA) \citep{PMC:09}. This is the
% first gold standard suitable for evaluating statistical SCFs systems
% in biomedicine. We compare two automatically acquired SCF lexicons
% against this gold standard.  The first is the BioLexicon, which, as
% described above, was acquired using components individually adapted
% to subdomains of biomedicine, and applied to a corpus representing a
% particular subdomain. The second lexicon is the CamBioLex, a novel
% large SCF lexicon which we acquired from the PMC OA corpus
% (representing the largest such corpus to be used in automatic SCF
% acquisition\footnote{Although e.g.~the SPECIALIST lexicon has broad
% coverage of subdomains, it is less comprehensive due to being
% manually compiled.}, using a SCF system developed at the University
% of Cambridge \citep{preiss:07}. The Cambridge system was previously
% used for general language, and we have applied it to a biomedical
% corpus without any domain adaptation.  This paper represents the
% first evaluation and comparison of SCF systems in biomedicine and
% allows us to gain insight into how current technology performs and
% how SCF information should ideally be acquired.

% Finally, we consider how SCFs vary at the level of biomedical
% subdomains.  We use the new large CamBioLex to explore subdomain
% variation in biomedicine by measuring the difference in
% subcategorization behavior across subdomains, providing a new
% perspective on subdomain variation. We present a detailed picture of
% subdomain variation in the behavior of six representative verbs.

% Finally, we find major variation between SCF behavior in biomedical
% subdomains.  Taken together, these points suggest the need for
% minimally supervised domain adaptation which can be applied easily
% (i.e.~without substantial manual effort) to the entire biomedical
% domain as well as to different subdomains, as required. We discuss
% future approaches, and release the resources presented here with the
% article (our different gold standards, along with the large
% CamBioLex lexicon) so that they can benefit further work in this
% area.


\section{Background}
\label{background}
\subsection{Introduction to Verb Subcategorization}
\label{subcat}

The traditional linguistic notion of subcategorization refers to the
syntactic arguments of a verb, that is, the syntactic phrase types
which occur obligatorily or with high probability for any given verb.
Some common syntactic phrase types which can serve as arguments to a
verb include noun phrases, prepositional phrases, subordinate clauses,
adjectives and adverbs.

Some basic examples of subcategorization frames (SCFs) can be seen in
Table~\ref{t:basic}. For the SCF names we use COMLEX Syntax notation
\citep{grishman:94}, which includes an abbreviation for each phrase
type in the SCF. Thus the SCF for a transitive verb (taking one direct
object noun phrase) is NP, and for a verb taking a direct object and a
prepositional phrase NP-PP.  Note that we do not specify the subject
NP as part of the SCF, since subjects are obligatory in English. Most
verbs take several SCFs. In Table~\ref{t:basic}, it can be seen that
{\it decrease} may occur with the following SCFs: NP, NP-PP, or
$\oslash$ (intransitive). On the other hand, {\it compare} occurs with
the first two but not as an intransitive.  In addition to presence and
absence, SCFs occur with different verb-specific frequencies.

\begin{table}
  \begin{tabular}{|l|p{0.8\columnwidth}|}
    \hline
    SCF & Example \\
    \hline
    \hline
    NP & The retraction screw and blade \underline{decreased} [$_{NP}$the risks of vessel injuries]. \\
    NP-PP & Heterozygosity for twine also \underline{decreases} [$_{NP}$the frequency of precocious NEB] [$_{PP}$to less than 10\%]. \\
    $\oslash$ & The contribution of cardiovascular diseases as cause of death \underline{decreased}. \\
    \hline
    NP & We \underline{compared} [$_{NP}$the performance of the Charlson and the Elixhauser comorbidity measures]. \\
    NP-PP & We \underline{compared} [$_{NP}$the predictions] [$_{PP}$to the known interaction signs]. \\
    $*$ $\oslash$ & $*$ We \underline{compared}. \\
    \hline
  \end{tabular}
  \caption{Sample SCFs for {\it decrease} and {\it compare}. Note that {\it compare} does not occur as an intransitive, represented by the asterisk. All examples adapted from the PubMed Open Access (PMC OA) corpus~\citep{PMC:09}.}
  \label{t:basic}
\end{table}


Additional examples of SCFs are shown in
Table~\ref{t:complex_scfs}. Here the COMLEX SCF names include
mnemonics for some additional information beyond the simple phrasal
types. For example, the frame NP-AS-NP is a subclass of NP-PP, where
the preposition is lexicalized as {\it as}. The frame NP-TOBE
represents a direct object and a predicate using {\it to be}. The
frame THAT-S represents a sentential complement introduced by the
complementizer {\it that}, and TO-INF is an infinitival complement
that uses the {\it to} form of the verb in the lower clause.

\begin{table}
  \begin{tabular}{|l|p{0.75\columnwidth}|}
    \hline
    SCF & Example \\
    \hline
    \hline
    NP-AS-NP & Perception of complex stimuli occurs too rapidly to \underline{support} rate coding as a reliable mechanism. \\
    NP-TOBE & The larger, unsaturated propyne group has been \underline{shown} to be a useful modification for antisense oligonucleotides. \\
    PP-PP & Threshold values \underline{ranged} from 0.01 to 0.99. \\
    THAT-S & Experiments with PTEN-null PGCs in culture \underline{revealed} that these cells had greater proliferative capacity.\\
    TO-INF & Administration of DA agonists to the rat PFC \underline{acts} to enhance working memory in these animals.\\
    \hline
  \end{tabular}
  \caption{Sample SCFs. All examples adapted from the PMC OA corpus.}
  \label{t:complex_scfs}
\end{table}

Comparing SCFs to another argument structure representation sometimes
used in biomedicine, SCFs are more general than Predicate-Argument
Structures (PASs), which have been used in Semantic Role Labeling
\citep{wattarujeekrit:04,tsai:05,tsai:08}. PASs include very specific
per-verb roles such as, for the verb {\it delete}, ``entity doing the
removing'', ``thing being removed'', and ``removed from''. SCFs also
do not identify thematic roles such as Agent and Patient nor
functional roles such as Subject and Object (though these types of
roles can often be inferred from the SCF), but simply the syntactic
phrase types that are selected by the verb (NP, PP, etc.). SCFs thus
provide a basic level of argument structure information which can aid
in event identification, but are general enough to be automatically
acquired for a large number of verbs, compared to PASs which must be
defined on a per-verb basis and thus can only practically be
identified for a small number of very frequent biomedical verbs.

An important concept for subcategorization is that of the {\it
  argument-adjunct} distinction, with the linguistic notion of
subcategorization -- and the one typically used in general language --
involving only arguments.  The hallmark of a syntactic {\it argument}
is that it is obligatory or very strongly selected by the
verb.\footnote{Recall, however, that most verbs take multiple SCFs
  which may involve different obligatory arguments. Therefore, the
  argument is properly considered to be obligatory with regard to the
  verb-SCF pair, not just the verb.} Arguments are distinguished from
{\it adjuncts}, which are phrases that elaborate on an event and are
generally optional. This distinction is often relevant for classifying
prepositional phrases. In particular, PPs describing location, manner,
or time tend to be adjuncts.

In Figure~\ref{f:argadj}, the PP {\it on Sunday} is optional,
elaborating on the event description by describing the time at which
the cooking event took place. The PP {\it on the patient} is
obligatory and exhibits a special, idiomatic meaning in the context of
the verb {\it operate}. The argument-adjunct distinction is sometimes
fuzzy, because the judgement of optionality can be difficult to make,
especially when a phrase type occurs with high frequency for a given
verb. However, Figure~\ref{f:argadj} illustrates another criterion,
namely that the meaning of arguments often depends on the particular
verb, while adjuncts maintain their interpretation (e.g.~temporal,
locative, manner) across a wide variety of verbal heads
\citep{grimshaw:90,pollard:87}. See \citet{merlo:06,abend:10} for
computational approaches to distinguishing arguments and adjuncts.

\begin{figure}
  \centering
  \begin{tabular}{|l|}
    \hline
    ADJUNCT:\\ 
    The chef \underline{cooked} a good lunch [$_{PP}$ on Sunday].\\[5pt]
    ARGUMENT: \\
    The surgeon \underline{operated} [$_{PP}$ on the patient]. \\
    \hline
  \end{tabular}
  \caption{Example adjunct and argument PPs.}
  \label{f:argadj}
\end{figure}


In biomedicine, subcategorization is often defined more broadly, to
include adjuncts that are less strongly selected but nevertheless
important for the complete description of an event, from the point of
view of Information Extraction. \citet{cohen:06} state that
``knowledge representation in this [biomedical] domain requires that
we {\it not} make a distinction between adjuncts and core
arguments''. As they note, the tradeoff is a loss of some ability to
generalize about adjuncts across verbs, but they argue that this loss
is outweighed by the ``biological integrity in the knowledge
representation''. Within a PAS annotation scheme, for example,
\citet{wattarujeekrit:04} includes the location PP in sentence
\ref{ex:loc} and the manner adverb in sentence \ref{ex:manner} as core
arguments, neither of which would be considered arguments in general
language. 

\ex.\label{ex:loc} Apparently HeLa cells either initiate transcription \underline{at multiple sites} \underline{within RPS14 exon 1} \ldots \citep{wattarujeekrit:04}

\ex.\label{ex:manner} Mice have previously been shown to develop
\underline{normally} \ldots \citep{wattarujeekrit:04}

Note that even under the broader definition, not every
phrase type that co-occurs with the verb is an argument;
\citet{wattarujeekrit:04} still consider aspectual or frequency adverbs
such as {\it still} or {\it always} to be adjuncts.

There have been several studies confirming the importance of relaxing
the argument-adjunct distinction in biomedicine.  For example, a study of
biomedical information extraction by \citet{rupp:10,ananiadou:10} found that 9.7\% of
verb arguments in their gold standard were correctly detected in prepositional phrases using a biomedical SCF lexicon, 
and would have been missed entirely based on the parser output alone.
The use of a more semantic criterion for
distinguishing arguments and adjuncts in biomedicine has become common.
A common implementation is to relax the definition of ``argument'' from \emph{obligatory} to \emph{high probability}, e.g.~using log-likelihoods \citep{biolexicon_new}.  The semantic definition then corresponds to a lower threshold for acceptance.


% \subsection{Domain variation}
% \label{variation}


% Domain variation is widely recognized as a major factor in deploying
% and extending NLP systems.  A basic approach to automatically
% producing NLP resources is to use machine learning to train models
% on annotated data, with the assumption that the model will then be
% applied to data drawn from a similar distribution.  The degree to
% which this assumption holds determines how successful the model will
% be, and in practice this often leads to major performance
% degredation.  Different models and applications will be sensitive to
% different types of linguistic variation: for example, a document
% classifier using a bag-of-words representation will be sensitive to
% lexical variation but not to syntactic variation, while a
% lexicalised parser will be sensitive to both.  Previous studies
% \citep{Gildea:01,Clark:Curran:07} have demonstrated significant
% drops in accuracy for a parser trained on the Wall Street Journal
% section of the Penn Treebank (newswire) and tested on the Brown
% corpus section of the Treebank (mixed genre).

% When considering the transfer of NLP tools and techniques to
% biomedical text processing applications, the distance between source
% and target domains is far greater than that between the WSJ and
% Brown corpora.  Moreover, it is an important question whether
% linguistic variation between specific fields of biomedicine can
% severely impact performance.  We refer to this type of variation as
% ``subdomain variation'' to emphasize that we are considering further
% subdivisions of domain-specific data.  Previous studies of
% biomedical subdomains range from extensions of traditional
% linguistic theory \citep{Friedman:EtAl:02} to performance evaluation
% of typical NLP tasks such as parsing \citep{Rimell:Clark:09}.
% Similar motivations have driven work into variation along other
% dimensions, such as publication format \citep{Verspoor:EtAl:09}.
% These studies have all emphasized the need to consider linguistic
% variation at a finer level than ``biomedical text'', hence our focus
% on subdomains.

\section{Verb Subcategorization Frame Acquisition Systems}

Automatic SCF acquisition systems typically consist of two major
components: hypothesis generation and hypothesis selection. As a
pre-processing step, a corpus of text is processed with a natural
language parser to produce a syntactic analysis for each sentence. The
hypothesis generator uses the parser output to decide which SCF is
taken by each verb in each sentence. These hypotheses are then
amalgamated into a lexicon, which consists of each verb occurring in
the corpus with its relative frequencies for each SCF.

The larger the corpus, the more likely it is that the lexicon will
capture a comprehensive set of SCFs for each verb. However, the output
of the hypothesis generation step is typically noisy, due to the
difficulty of the task (e.g. parsing errors).  Thus a filtering step
is required to select from among the hypotheses those that are most
reliable. Filtering is a challenging task, since some SCFs are
inherently rare; infrequent attestation does not always mean an SCF should be
filtered out of the lexicon.  Ideally the filtering process does
not make use of lexical information such as verb semantic classes or SCF dictionaries, as this introduces a circular dependency,
although such resources are routinely used in real-world systems.

Within these broad outlines, approaches vary along several dimensions;
see \cite{schulteimwalde:09} for an overview. Hypothesis generation
may involve a shallow parser (chunker) or a deep grammatical
parser. The SCF inventory may be manually defined, in which case the
task of hypothesis generation involves matching the syntactic analyses
to the pre-defined SCFs; or the SCF inventory may be learned directly
from the corpus. The size of SCF inventories can vary widely between
systems, from only a few to some two hundred SCFs, although more
recent state of the art systems for general language tend to use
relatively large inventories. There are a number of mechanisms for
generating hypotheses, as well, using a variety of cues in the parsed
text to identify the SCFs.


\subsection{SCFs in general language}
\label{subcat_system}

\subsubsection{Existing resources}

There are several existing computational verb lexicons that provide
syntactic and/or semantic information for general language.  For
example, the COMLEX lexicon \citep{grishman:94} provides
subcategorization information for c.~6000 general language verbs. FrameNet
\citep{framenet} and VerbNet \citep{verbnet} provide both syntactic
and semantic information about predicate argument
structure for c.~3000 and c.~4000 verbs, respectively. PropBank \citep{propbank} is an extension of the Penn
TreeBank \citep{penntreebank} with information about
predicate-argument relationships for c.~5600 verbs. 

The VALEX \citep{valex} verb
lexicon is the largest SCF resource
available for general language. It contains SCF and frequency
information for c.~6,400 verbs learned from up to 10,000 sentences
per verb.  In contrast to the aforementioned resources, VALEX is built automatically
from large amounts of data, rather than via manual annotation.

\subsubsection{Acquisition methodology}

We now describe an example of an SCF acquisition system for general language: the state-of-the-art system used to produce the VALEX
lexicon, hereafter referred to as the \emph{Cambridge system}.  The Cambridge system operates on output from the RASP parsing suite \citep{briscoe:06}.
RASP is a modular statistical parsing suite which includes a
tokenizer, tagger, lemmatizer, and a wide-coverage unification-based
tag-sequence parser.  The parser is unlexicalized, which means it considers a sentence's sequence of part-of-speech tags (and not the words themselves).  It therefore cannot learn verb-specific behavior (like SCFs) that would bias the system towards a pre-existing notion of subcategorization.  The parser's output is a dependency tree of \emph{grammatical
  relations}.  Figure \ref{dependencytree} shows the tree structure assigned to the sentence ``He knew that it was true.''

\begin{figure}
  \centering
  \begin{dependency}[theme = simple]
    \tikzstyle{verb}=[text=blue]
    \begin{deptext}[column sep=.1em]
      He \& |[verb]| knew \& that \& it \& was \& true \\
      PPIS1 \&        VVD \& CST \& PPH1 \& VBDZ \& JJ \\
    \end{deptext}
    \depedge{2}{1}{ncsubj}
    \depedge{2}{3}{ccomp}
    \depedge{3}{5}{ccomp}
    \depedge{5}{4}{ncsubj}
    \depedge{5}{6}{xcomp}
  \end{dependency}
  \caption{Example RASP output for the sentence ``He knew that it was true.''  Each arc represents a grammatical relation between two words.  This is an example of the verb ``know'' taking a sentential complement frame (THAT-S).}
  \label{dependencytree}
\end{figure}


The Cambridge system defines an SCF inventory of 163 frames.  Each frame is specified in terms of the grammatical relations connecting the verb to its arguments, the POS tags of the arguments, and some basic lexical information.  Continuing with the example sentence from figure \ref{dependencytree}, figure \ref{examplescf} shows the definition of the sentential complement frame that would match its dependency tree.  It specifies that the lexical item \emph{x} takes SCF THAT-S if 1) it is a verb, 2) it is the head in subject and complement relations, and 3) the dependent of the complement relation is also a verb with a subject.

\begin{figure}
\begin{verbatim}
(((|ncsubj| ?x ?y _) (|ccomp| ?a ?x ?v) (|ncsubj| ?v ?n _))
     (and (word-value (quote ?a) "that")
          (strict '?x '?patterns '?grs)
          (pos-start (quote ?x) "VV")
          (pos-start (quote ?v) "V"))
     (?x THAT-S))
\end{verbatim}
  \caption{Cambridge frame rule for a verb taking a sentential complement (THAT-S).  This would match the sentence in figure \ref{dependencytree}.}
  \label{examplescf}
\end{figure}

Verb instances are thus matched to SCFs, and aggregated into preliminary lexical entries for each verb, containing
the raw and relative frequencies of SCFs.  Finally, these entries are filtered to obtain a more accurate
lexicon.  The most basic approach simply removes verb-SCF pairs with a relative
frequency less than a given threshold: previous work has found a threshold of .02 to produce optimal results.
%More sophisticated filtering methods can lead to significant improvement, but 
%depend on external, and domain-specific, resources such as semantic verb classes.

This method has several drawbacks.  First, frame definitions as in figure \ref{examplescf} must be manually written and maintained: not only is this difficult work, it also ties the definitions to particular formalisms, such as the POS and grammatical relation inventories.  It also sidesteps the question of whether a different inventory might be more suitable for specialized language.  Second, the method is sensitive to parsing errors, which are known to increase when dealing with biomedical text \cite{verspoor:2012}.  Finally, there has been no evaluation so far of how the method performs in biomedicine.

\subsection{Biomedicine-specific SCF resources}

\subsubsection{Existing resources}

A small number of verb lexicons already exist for
biomedicine. BioFrameNet~\citep{dolbey:06} extends FrameNet with
links to biomedical resources (e.g.~gene ontologies) for verb frames related to intracellular transport.  The UMLS
SPECIALIST Lexicon \citep{mccray:94} includes coarse verb
subcategorization information for some 11,000 verbs, but is manually
built from a variety of biomedical and general language
dictionaries.  BioProp \citep{tsai:05} adds PropBank-style
annotation to 500 abstracts from the GENIA corpus.  PASBio
\citep{wattarujeekrit:04} is an inventory of predicate-argument
structure frames for 30 verbs, focused on molecular biology.  The
frames were constructed through expert examination of MEDLINE
sentences, using guidelines similar to those of PropBank. The
resource most relevant to this study is the BioLexicon
\citep{biolexicon_new}, which includes semi-automatically acquired verb
subcategorization information for 658 verbs.

\subsubsection{Acquisition methodology}
\label{biolexicon}

We now describe the only existing system specifically for automatic SCF acquisition
in biomedicine, that was used to produce the BioLexicon
\citep{biolexicon_new} (hereafter referred to as the \emph{BioLexicon system}).  Where the
Cambridge system uses the unlexicalized general-language RASP parser,
the BioLexicon system uses a version of the lexicalized Enju parser \citep{enju} that has been trained
on the GENIA treebank of molecular biology abstracts as described in \citep{hara:06}.  Like
the Cambridge system, the BioLexicon system considers a verb's grammatical relations to indicate its frame, but no SCF inventory is
assumed in advance; rather, the set of grammatical relations for
each verb instance are considered as a potential SCF.  These are
filtered at a relative frequency threshold of 0.03, i.e. for any given
verb, all SCFs with a relative frequency less than 0.03 are
discarded. To produce the lexicon, this procedure is run over six million
words of MEDLINE \emph{E.~coli} abstracts and articles, leading
to an inventory of 136 SCFs. Further
arguments and strongly-selected adjuncts are chosen according to their
log-likelihood with respect to the verb.  

It is important to note that
the BioLexicon system draws on a single subdomain of biomedical literature, and uses manually-annotated training data
that would be expensive to produce for new subdomains.
Moreover, the parsing model used in SCF discovery is lexicalized and therefore adapted to
the subcategorization phenomena present in the training data.  
While there are immediate benefits to these
approaches in terms of accuracy in SCF acquisition within the same
subdomain as the training data, the model's reliance on manual annotation
is costly, and its preconception of subcategorization may introduce
bias against new subdomain behaviors.  Finally, since the resources used to
build and evaluate the BioLexicon system are drawn from a subset of the biomedical literature,
there has been no study of how it performs on the entire range of subdomains.


\section{Investigation of Subdomain Variation}
\label{investigation_variation}

   \subsection{Motivation}
   Both systems we have described potentially suffer from the effects of subdomain variation: the Cambridge system because it is trained on general language, and the BioLexicon system because its parser is tuned on a small subset of biomedical text, and applied to abstracts regarding a single organism.  While we presently lack a gold standard for measuring absolute performance of these systems on biomedical text, we can consider the question of how much subdomains of biomedicine vary in SCF behavior.  If this variation is high, it implies that even using adapted resources like the BioLexicon system will lead to problems when applied to subdomains that it was not trained on.  The infeasibility of creating manual resources for each biomedical subdomain would then require less supervised approaches.

   \subsection{Subdomain Variation Methods}
   \label{subdomain_variation_methods}

   This section describes our approach to quantifying differences in
   verb subcategorization behavior across subdomains of biomedicine.
   The primary type of data that we investigate is a verb's {\it SCF
     distribution}, that is, the probability distribution representing
   the relative frequency of the verb appearing with a given SCF. Our goal is to discover the presence or
   absence of significant differences between a verb's SCF
   distribution in different subdomains. By investigating whether
   individual verbs exhibit specialized behavior across subdomains, we
   build up an overall picture of subdomain variation in verb
   subcategorization.

\subsubsection{Data and SCF extraction}
   To obtain the SCF distributions we use one of the general language
   systems, namely the Cambridge system, because it is unbiased with
   respect to a given subdomain of biomedicine.  
   The PubMed Central Open Access subset (PMC OA) includes a classification of journals by subdomain.
   We apply the Cambridge system to the 37 largest subdomains, which produces an SCF distribution for each combination of verb and subdomain.

\subsubsection{Measuring divergence}
To measure the distance between two SCF distributions we use the
Jensen-Shannon divergence (JSD) \citep{Grosse:02}, a finite and
symmetric measurement of divergence between probability distributions,
defined as:
\[
\mathrm{JSD}(X, Y) = H\!\left(\frac{X + Y}{2}\right) - \frac{H(X) + H(Y)}{2},
\]
where \( H \) is the Shannon entropy of a distribution \( P \),
\[
H(P) = - \sum_x P(x) \log_2 P(x).
\]
JSD values range between 0 (identical distributions) and 1 (disjoint
distributions); the measure is closely related to the familiar, but
asymmetric, Kullback-Leibler divergence \citep{cover:91}.  We
calculate the JSD between a given verb's SCF distributions for each
pair of subdomains.

\subsubsection{Presentation}
We applied this methodology to 30 verbs, and present detailed results for six: {\it develop}, {\it
  express}, {\it perform}, {\it predict}, {\it recognize} and {\it
  treat}.  These verbs were chosen because they exemplify one or more
interesting properties, such as sharp divergence in a single subdomain or a wide
variety of behaviors across all subdomains. 
For a given verb, we only show subdomains in which it occurs a minimum of
200 times. For each of the six verbs we present four different views of the data:

\emph{Heat maps} present pairwise calculations of a metric between a set of
objects: cell \( \langle x, y \rangle \) is shaded according to the value of
\( \mathit{metric}(x, y) \).  Our heat maps show the JSD values between pairs of
subdomains for a given verb: the cells are shaded from white (JSD
value of 1, maximum divergence) to black (JSD value of 0, identity).
The actual values are inscribed in each cell.

\emph{Dendrograms} present the results of hierarchical clustering performed
directly on the JSD values.  The algorithm begins with each instance
(in our case, subdomains) as a singleton cluster, and repeatedly joins
the two most similar clusters until all the data is clustered
together.  The order of these merges is recorded as a tree structure
that can be visualised as a dendrogram in which the length of a branch
represents the distance between its child nodes.  Similarity between
clusters is calculated using average cosine distance between all
members, known as ``average linkage''.  The tree leaves represent data
instances (subdomains) and the paths between them are proportional to
the pairwise distance.  This allows visualization of multiple
potential clusterings, as well as a more intuitive sense of how
distinct the clusters truly are.  Rather than choosing a set number of
flat clusters, the trees mirror the nested structure of the data.

\emph{Scatter plots} project the optimal K-Means clustering onto the first
two principal components of the data.  The optimal clustering was
determined via the Gap Statistic \citep{Tibshirani:01}, which
increases the cluster count and runs K-Means until the improvement in
error on the data is within a small range of the improvement on
randomly-generated data with similar statistical properties.  The
principal components are normalised, and points coloured according to
cluster membership, with the subdomain written immediately above.  The
clustering is performed using the full SCF distributions, while the
principal component analysis relies on decomposing the distributions
into two optimal dimensions.

\emph{Top SCF tables} show the top three SCFs for each subdomain, along with
their relative frequencies.  The SCFs are shown in their equivalent
COMLEX forms, which reflect the complements involved, as described in
Section \ref{subcat}.

\subsection{Discussion}
\label{subdomain_variation_discussion}

\subsubsection{Other views of subdomain variation}
In previous studies \cite{Verspoor:EtAl:09,Lippincott:2011} biomedical
subdomains have been compared in terms of the frequencies of basic lexical items
(verb, noun, adverb and adjective lemmas, part-of-speech tags, etc)
and using topic and selectional preference modeling methods.  The
results often contrast with those of the current paper, and we briefly
review them here for easier comparison.

In \cite{Lippincott:2011} it was found that subdomains formed stable
clusters in terms of basic lexical behavior, and several recurrent
clusters were identified, shown in Table \ref{clusters}.  The first
cluster includes subdomains dealing primarily with microscopic
processes and can be further subdivided into groupings of biochemical
(\emph{Biochemistry, Genetics}) and cellular (\emph{Cell Biology},
\emph{Embryology}) study.  The second cluster includes subdomains
focused on specific anatomical systems (\emph{Endocrinology},
\emph{Pulmonary Medicine}).  The third cluster includes subdomains
focused on clinical medicine (\emph{Psychiatry}) or specific
patient-types (\emph{Geriatrics}, \emph{Pediatrics}).  The fourth and
final cluster includes subdomains focused on social and ethical
aspects of medicine (\emph{Ethics}, \emph{Education}).

\begin{table*}[h]
    \small

  \begin{tabular}{|p{0.15\textwidth}|p{0.2\textwidth}|p{0.2\textwidth}|p{0.15\textwidth}|p{0.15\textwidth}|}
    \hline
    \multicolumn{2}{|c|}{\emph{Microscopic}} & & & \\
    \cline{1-2}
    \emph{Cellular}     & \emph{Biochemical}        & \emph{System-specific}    & \emph{Clinical}           & \emph{Social} \\
    \hline 
    Cell Biology & Biochemistry       & Endocrinology      & Geriatrics         & Ethics \\
    Virology     & Molecular Biology  & Rheumatology       & Pediatrics         & Education \\
    Microbiology & Genetics           & Pulmonary Medicine & Psychiatry         & \\
    Embryology   &                    &                    & Obstetrics         & \\
    \hline
  \end{tabular}
  \caption{Common subdomain clusters when considering lexical features.}
  \label{clusters}
\end{table*}

Almost all variation was significant at a high (\( > 0.99 \))
level, supporting the intuition that lexical features such as
vocabulary are primary aspects of different subdomains.  It was also
noted that the handful of syntactic features considered, such as
average sentence length and grammatical relation types, did not
necessarily align with the more stable lexical clusters.  Verbs showed
a mixture of syntactic and lexical variation, reflecting their
combined semantic and syntactic roles.

\subsubsection{Verb subcategorization behavior}
\label{subdomain_subcat_behavior}

We now discuss the results of our study of SCF behavior across subdomains as described in section \ref{subdomain_variation_methods}.
At a high level, our experiments found large differences in the amount of variation
a verb could exhibit between subdomains.  For example, the verb {\it
  induce} has a maximum JSD of .07 (low variation, between Botany and Physiology),
while {\it develop} has a maximum of .62 (high variation, between Embryology and
Therapeutics).  Similarly, some verbs shift behavior in just one or
two subdomains (e.g.~{\it activate} in Molecular Biology and
Biochemistry) while others are broadly heterogeneous (e.g.~{\it
  predict}).

In contrast to the lexical results, verb subcategorization tends to
show small pockets of specialized behavior, and the distinction
between microscopic, systemic, clinical and social subdomains is less
consistent.  Instead, there are cases where verbs have taken
on a specific usage in a single subdomain.  The clearest example of
this is {\it develop} (Figure \ref{develop:hm}), which has a
distinct emphasis on intransitive usage INTRANS in Embryology (``The
fetus develops''), compared to its typical transitive usage NP in
other subdomains (``The patient developed a tumor'').  A similar example is the
verb {\it express}, which takes NP-AS-NP-SC (``We express it as a ratio'')
frequently in most subdomains, but not in Genetics and Cell Biology,
where the simple transitive NP is unusually common.  Sometimes the reasons for specialized behavior are not so obvious:
{\it perform} behaves differently in Medical Informatics and
Education as compared to other subdomains.  Both subdomains show unusually high usage of NP-PRED-RS,
and Education is unique in its frequent use of TRANS.

Not all verb behavior follows the pattern of extreme specialization in
one or two subdomains: the heatmap for {\it predict} (Figure
\ref{predict:hm}), for example, is extremely diverse.  Looking at the
corresponding dendrogram (Figure \ref{predict:dend}) shows a clear
distinction between system-specific and clinical subdomains in the top
half, and the microscopic subdomains in the bottom half.  The top SCFs
(Table \ref{predict:table}) show that the microscopic subdomains use
{\it predict} in conjunction with infinitival forms (e.g.~NP-TOBE,
``We predicted it to be'').  {\it Recognize}, like {\it predict},
shows a diverse set of JSD values.  It is unclear why some subdomains
prefer e.g.~THAT-S or NP-AS-NP, except perhaps that diagnosis-oriented
subdomains prefer the latter.

Some verbs may have more than one specialized behavior: {\it treat} is
generally either used in a clinical sense (NP-FOR-NP, ``We treat the
patient for concussion'') or attributive (NP-AS-NP-SC, ``We treat the
infection as a separate issue''). The most distinct subdomain, Public
Health, appears as an outlier because of its unique combination of
both usages.  This is an example of a heterogeneous subdomain merging
SCF behaviors into a third, unique distribution.

%EXPAND:

There are several reasons why our results with SCFs differ from
the results obtained with lexical features in previous subdomain
comparisons \cite{Verspoor:EtAl:09,Lippincott:2011}. One factor is
that we considered individual verbs, whereas lexical studies average
variation across all lexical items of a given class.  This has a smoothing effect on the specialized behavior.  Another
factor is that distinct senses of a verb, e.g. general and
specialized, may create confounding effects when the SCF behavior of
the two senses is overlaid in a subdomain. There are two possible reasons
for this: that distinct usages exist side-by-side within individual
documents, or that the subdomains group together documents that are linguistically
quite different.  Either case implies that flexible, data-driven SCF lexicons
are particularly important for the PMC OA.


Our results here show that
there is considerable subdomain variation in verb SCFs in biomedicine
which should be taken into account in the development and application
of SCF systems in this domain. Future work could look at the nature of
this variation in more detail, e.g.~by broadening the set of verbs
considered and averaging the divergence in their SCF distributions to
determine whether there is a correlation with the lexical results.
This would require a principled way of combining the distributions,
beyond simple equal weighting, because the proportion of verbs that
change SCF behavior is small and would be overwhelmed by noise.

\section{Conclusions and recommendations}
\label{conclusions}


Our study has provided some insights into the current state of verb
subcategorization frame acquisition for biomedicine.

Our review of the state of SCF acquisition in biomedical text processing has found very little in the way of direct (i.e. intrinsic) performance evaluation.  Basic questions, such as how general language systems perform on biomedicine, and how well a lexicon acquired from one subdomain translates to others, are best answered by a human-annotated gold standard.  Currently, no such resource exists representative of biomedicine in general, even as research pushes forward with domain-specific approaches.  It is crucial that we have a gold standard to guide efforts in domain adaptation, and simply to evaluate the real-world performance of proposed systems.

Although direct evaluation of SCF acquisition is important, it could
be supplemented with task-based (i.e. extrinsic) evaluation which uses the output of
a system to augment performance on a downstream task that is easier
to assess \citep{vlachos:2011}. For example, an unlexicalized parser
or relationship extractor could be augmented with SCF, and then
re-evaluated to determine improvement.  In this setup, the
definition of subcategorization and the SCF inventories used by each
system would not need to be reconciled: the candidate parses would
simply be reranked based on the new probabilities from the lexicon.
Decoupling evaluation from a particular definition and inventory would
facilitate the development and comparison of new approaches to SCF acquisition.

We found significant variation in SCF behavior between biomedical
subdomains, with different properties than in previously studied
lexical variation.  Most notably, subdomain clusters produced from the
subcategorization behavior of individual verbs did not align well with
clusters based on simple lemma frequencies \cite{Lippincott:2011}, and
often were not readily interpretable in terms of major
subdomain-spanning topics. Some verb behavior occurred in discrete
pockets, just one or two subdomains, rather than in one of the major
clusters identified in lexical studies.  While future work could
broaden the scope of these experiments and aim to obtain a more
precise idea of the nature of subdomain variation in biomedicine, the
results already presented here highlight the need for
subdomain-adaptation in SCF acquisition.

%Despite the lack of a gold standard to measure absolute performance, the high level of variation relative to subdomains and difficulties in producing manual resources for adaptation recommend less supervised approaches.
Unsupervised approaches have a particular
advantage in domain adaptation, since they do not rely on manually
created resources and because their definitions and inventories
emerge from their domain-specific input data. Ideally, such
approaches would also involve moving away from features that require
manual domain-adaptation for optimal performance (such as parser
output), to shallower and more robust features like parts-of-speech
or phrase chunking.  There are a range of semi-supervised methods
between these extremes, such as self-training and hybrid graphical
modeling \citep{zhu:2006}, which may help yield optimal performance
while minimising the need for manual annotation.  An interesting
area for future work is determining an optimal middle ground.

% \appendix

%\begin{comment}
  \newpage
  \begin{figure*}
    \centering
    \includegraphics[height=.4\textheight]{figures/develop_heatmap.png}
    \caption{Heat map of Jensen-Shannon divergence between subdomains
      for the SCF distributions of {\it develop}.}
    \label{develop:hm}
  \end{figure*}

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/develop_dendrogram.png}
  \caption{Hierarchical clustering of subdomains via average-linking
    for the SCF distributions of {\it develop}.}
  \label{develop:dend}
\end{figure*}

\newpage

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/develop_pca.png}
  \caption{Two-dimensional PCA reduction with Gap-statistic-optimal
    clustering for the SCF distributions of {\it develop}.}
  \label{develop:pca}
\end{figure*}

\begin{figure*}
  \centering \scriptsize
  \begin{tabular}{| l | l l | l l | l l |}
    \hline
    Subdomain & \multicolumn{6}{| c |}{Top three SCFs} \\
    \hline
    Psychiatry & NP & 0.399905 & NP-PRED-RS & 0.141902 & NP-FOR-NP & 0.137602 \\
    Education & NP & 0.328025 & NP-FOR-NP & 0.140127 & INTRANS & 0.121019 \\
    Environmental Health & NP & 0.309671 & INTRANS & 0.138097 & NP-FOR-NP & 0.128797 \\
    Pharmacology & NP & 0.441249 & NP-FOR-NP & 0.118324 & NP-PRED-RS & 0.115859 \\
    Geriatrics & NP & 0.390192 & NP-PRED-RS & 0.140725 & NP-FOR-NP & 0.115139 \\
    Public Health & NP & 0.361242 & NP-FOR-NP & 0.158063 & NP-PP-PRED & 0.101749 \\
    Biotechnology & NP & 0.356888 & NP-FOR-NP & 0.173096 & NP-PRED-RS & 0.098217 \\
    Biomedical Engineering & NP & 0.385159 & NP-FOR-NP & 0.169611 & NP-PP-PRED & 0.111307 \\
    Medical Informatics & NP & 0.410649 & NP-FOR-NP & 0.168911 & NP-PP-PRED & 0.083231 \\
    Obstetrics & NP & 0.315455 & INTRANS & 0.152435 & NP-PRED-RS & 0.120678 \\
    Medicine & NP & 0.345473 & NP-PRED-RS & 0.137849 & NP-PP-PRED & 0.091899 \\
    Genetics, Medical & NP & 0.303856 & NP-PRED-RS & 0.143445 & NP-FOR-NP & 0.114139 \\
    Tropical Medicine & NP & 0.345211 & INTRANS & 0.116705 & NP-PRED-RS & 0.114743 \\
    Microbiology & NP & 0.293089 & NP-FOR-NP & 0.127123 & NP-PRED-RS & 0.095342 \\
    Neoplasms & NP & 0.304064 & NP-PRED-RS & 0.147233 & NP-PP-PRED & 0.099857 \\
    Critical Care & NP & 0.340197 & NP-PRED-RS & 0.182325 & INTRANS & 0.099528 \\
    Molecular Biology & NP & 0.245846 & NP-FOR-NP & 0.156345 & NP-PP-PRED & 0.100831 \\
    Physiology & NP & 0.366467 & NP-FOR-NP & 0.131138 & NP-PRED-RS & 0.100599 \\
    Veterinary Medicine & NP & 0.287117 & NP-PRED-RS & 0.117791 & INTRANS & 0.099387 \\
    Science & NP & 0.263721 & INTRANS & 0.128445 & NP-PP-PRED & 0.109314 \\
    Genetics & NP & 0.261829 & NP-FOR-NP & 0.142401 & INTRANS & 0.107713 \\
    Neurology & NP & 0.231093 & INTRANS & 0.207683 & NP-PP-PRED & 0.103842 \\
    Cell Biology & NP & 0.223591 & INTRANS & 0.200704 & PP-PRED-RS & 0.084507 \\
    Therapeutics & NP & 0.350314 & NP-FOR-NP & 0.155172 & NP-PRED-RS & 0.101097 \\
    Endocrinology & NP & 0.273525 & INTRANS & 0.161085 & NP-PRED-RS & 0.137959 \\
    Communicable Diseases & NP & 0.287262 & NP-PRED-RS & 0.149480 & INTRANS & 0.144714 \\
    Pediatrics & NP & 0.361596 & NP-PRED-RS & 0.194514 & INTRANS & 0.124688 \\
    Biochemistry & NP & 0.285505 & INTRANS & 0.231332 & NP-FOR-NP & 0.120059 \\
    Botany & NP & 0.281346 & INTRANS & 0.189602 & NP-FOR-NP & 0.128440 \\
    Virology & NP & 0.379412 & NP-PRED-RS & 0.136275 & NP-FOR-NP & 0.109804 \\
    Gastroenterology & NP & 0.334848 & NP-PRED-RS & 0.210606 & NP-PP-PRED & 0.127273 \\
    Pulmonary Medicine & NP & 0.300429 & NP-PRED-RS & 0.158798 & NP-PP-PRED & 0.115880 \\
    Ethics & NP & 0.274298 & INTRANS & 0.228942 & NP-PP-PRED & 0.155508 \\
    Vascular Diseases & NP & 0.318367 & NP-PRED-RS & 0.155102 & INTRANS & 0.101224 \\
    Rheumatology & NP & 0.306562 & NP-PRED-RS & 0.159647 & NP-PP-PRED & 0.119491 \\
    Ophthalmology & NP & 0.245421 & NP-PRED-RS & 0.146520 & INTRANS & 0.124542 \\
    Embryology & INTRANS & 0.510504 & INTRANS-RECIPSUBJ-PL & 0.172269 & NP & 0.120798 \\

    \hline
  \end{tabular}
  \caption{Top three SCFs, by subdomain, for {\it develop}.}
  \label{develop:table}
\end{figure*}
\clearpage

\newpage
\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/express_heatmap.png}
  \caption{Heat map of Jensen-Shannon divergence between subdomains
    for the SCF distributions of {\it express}.}
  \label{express:hm}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/express_dendrogram.png}
  \caption{Hierarchical clustering of subdomains via average-linking
    for the SCF distributions of {\it express}.}
  \label{express:dend}
\end{figure*}

\newpage

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/express_pca.png}
  \caption{Two-dimensional PCA reduction with Gap-statistic-optimal
    clustering for the SCF distributions of {\it express}.}
  \label{express:pca}
\end{figure*}

\begin{figure*}
  \centering \scriptsize
  \begin{tabular}{| l | l l | l l | l l |}
    \hline
    Subdomain & \multicolumn{6}{| c |}{Top three SCFs} \\
    \hline
    Genetics & NP & 0.484719 & NP-PP-PRED & 0.088202 & NP-PRED-RS & 0.077303 \\
    Cell Biology & NP & 0.436123 & NP-PRED-RS & 0.256388 & NP-PP-PRED & 0.183260 \\
    Genetics, Medical & NP & 0.445434 & NP-PRED-RS & 0.084633 & NP-PP-PRED & 0.082405 \\
    Biochemistry & NP & 0.320611 & NP-PRED-RS & 0.122137 & NP-AS-NP-SC & 0.113700 \\
    Botany & NP & 0.457393 & NP-PRED-RS & 0.107769 & PP-PRED-RS & 0.084586 \\
    Molecular Biology & NP & 0.401806 & NP-PP-PRED & 0.151806 & NP-PRED-RS & 0.125282 \\
    Microbiology & NP & 0.393716 & NP-PRED-RS & 0.192811 & NP-PP-PRED & 0.152821 \\
    Tropical Medicine & NP & 0.362590 & NP-AS-NP-SC & 0.152518 & NP-AS-NP & 0.152518 \\
    Pharmacology & NP & 0.300459 & NP-AS-NP-SC & 0.181193 & NP-AS-NP & 0.181193 \\
    Physiology & NP & 0.320866 & NP-AS-NP-SC & 0.140748 & NP-AS-NP & 0.140748 \\
    Endocrinology & NP & 0.389426 & NP-PRED-RS & 0.131325 & NP-AS-NP-SC & 0.117112 \\
    Neoplasms & NP & 0.439103 & NP-PP-PRED & 0.200038 & NP-PRED-RS & 0.171003 \\
    Biotechnology & NP & 0.416469 & NP-PRED-RS & 0.182106 & NP-PP-PRED & 0.165479 \\
    Rheumatology & NP & 0.435431 & NP-PRED-RS & 0.136413 & NP-PP-PRED & 0.132412 \\
    Neurology & NP & 0.384721 & NP-PP-PRED & 0.137646 & NP-PRED-RS & 0.135582 \\
    Communicable Diseases & NP & 0.336735 & NP-AS-NP-SC & 0.204082 & NP-AS-NP & 0.204082 \\
    Virology & NP & 0.388041 & NP-PRED-RS & 0.227216 & NP-PP-PRED & 0.185567 \\
    Science & NP & 0.392503 & NP-PRED-RS & 0.172770 & NP-PP-PRED & 0.138302 \\
    Medicine & NP & 0.396785 & NP-PRED-RS & 0.167203 & NP-PP-PRED & 0.154984 \\
    Vascular Diseases & NP-AS-NP-SC & 0.281022 & NP-AS-NP & 0.281022 & NP & 0.253650 \\
    Pulmonary Medicine & NP & 0.328225 & NP-AS-NP-SC & 0.186462 & NP-AS-NP & 0.186462 \\
    Environmental Health & NP & 0.281679 & NP-AS-NP-SC & 0.167877 & NP-AS-NP & 0.167877 \\
    Public Health & NP & 0.266667 & NP-PP-PRED & 0.183333 & NP-PP & 0.126190 \\

    \hline
  \end{tabular}
  \caption{Top three SCFs, by subdomain, for {\it express}.}
  \label{express:table}
\end{figure*}
\clearpage

\newpage
\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/perform_heatmap.png}
  \caption{Heat map of Jensen-Shannon divergence between subdomains
    for the SCF distributions of {\it perform}.}
  \label{perform:hm}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/perform_dendrogram.png}
  \caption{Hierarchical clustering of subdomains via average-linking
    for the SCF distributions of {\it perform}.}
  \label{perform:dend}
\end{figure*}

\newpage

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/perform_pca.png}
  \caption{Two-dimensional PCA reduction with Gap-statistic-optimal
    clustering for the SCF distributions of {\it perform}.}
  \label{perform:pca}
\end{figure*}

\begin{figure*}
  \centering \scriptsize
  \begin{tabular}{| l | l l | l l | l l |}
    \hline
    Subdomain & \multicolumn{6}{| c |}{Top three SCFs} \\
    \hline
    Medical Informatics & NP & 0.361941 & NP-PP-PRED & 0.177756 & NP-PRED-RS & 0.084217 \\
    Education & NP & 0.442718 & NP-PRED-RS & 0.116505 & INTRANS & 0.100971 \\
    Molecular Biology & NP & 0.248283 & NP-ING-SC & 0.124142 & NP-ING-OC & 0.124142 \\
    Genetics, Medical & NP & 0.262342 & NP-ING-SC & 0.120675 & NP-ING-OC & 0.120675 \\
    Pharmacology & NP & 0.304765 & NP-PP-PP PFORM & 0.146923 & NP-ING-SC & 0.102581 \\
    Critical Care & NP & 0.441001 & NP-PP-PP PFORM & 0.080182 & NP-ING-SC & 0.075064 \\
    Communicable Diseases & NP & 0.431208 & NP-ING-SC & 0.075201 & NP-ING-OC & 0.075201 \\
    Gastroenterology & NP & 0.484485 & NP-PP-PP PFORM & 0.070187 & NP-ING-SC & 0.069537 \\
    Therapeutics & NP & 0.511537 & NP-FOR-NP & 0.065702 & NP-ING-SC & 0.057880 \\
    Ophthalmology & NP & 0.536599 & NP-PP-PRED & 0.057788 & NP-ING-SC & 0.055036 \\
    Obstetrics & NP & 0.454327 & NP-PP-PRED & 0.079327 & NP-ING-SC & 0.066106 \\
    Biomedical Engineering & NP & 0.393035 & NP-PP-PRED & 0.074627 & NP-TO-INF-OC & 0.073383 \\
    Pulmonary Medicine & NP & 0.374464 & NP-PP-PP PFORM & 0.094271 & NP-ING-SC & 0.092366 \\
    Medicine & NP & 0.362900 & NP-ING-SC & 0.099518 & NP-ING-OC & 0.099518 \\
    Physiology & NP & 0.394495 & NP-ING-SC & 0.083524 & NP-ING-OC & 0.083524 \\
    Neoplasms & NP & 0.382559 & NP-PP-PP PFORM & 0.091148 & NP-ING-SC & 0.083187 \\
    Rheumatology & NP & 0.333756 & NP-PP-PP PFORM & 0.106480 & NP-ING-SC & 0.089181 \\
    Neurology & NP & 0.331288 & NP-PP-PP PFORM & 0.105171 & NP-ING-SC & 0.088721 \\
    Tropical Medicine & NP & 0.370042 & NP-ING-SC & 0.105513 & NP-ING-OC & 0.105513 \\
    Psychiatry & NP & 0.381216 & NP-PRED-RS & 0.092344 & NP-PP-PRED & 0.092344 \\
    Environmental Health & NP & 0.300141 & NP-PP-PRED & 0.103796 & NP-ING-SC & 0.091065 \\
    Pediatrics & NP & 0.450953 & NP-PRED-RS & 0.073572 & NP-PP-PRED & 0.062320 \\
    Veterinary Medicine & NP & 0.407389 & NP-ING-SC & 0.099351 & NP-ING-OC & 0.099351 \\
    Vascular Diseases & NP & 0.444747 & NP-ING-SC & 0.089117 & NP-ING-OC & 0.089117 \\
    Geriatrics & NP & 0.457423 & NP-PRED-RS & 0.080250 & NP-TO-INF-VC & 0.072671 \\
    Virology & NP & 0.312346 & NP-ING-SC & 0.115070 & NP-ING-OC & 0.115070 \\
    Embryology & NP & 0.260802 & NP-ING-SC & 0.135802 & NP-ING-OC & 0.135802 \\
    Microbiology & NP & 0.276414 & NP-ING-SC & 0.126016 & NP-ING-OC & 0.126016 \\
    Botany & NP & 0.249518 & NP-ING-SC & 0.131218 & NP-ING-OC & 0.131218 \\
    Biochemistry & NP & 0.264828 & NP-ING-SC & 0.134100 & NP-ING-OC & 0.134100 \\
    Science & NP & 0.255107 & NP-ING & 0.130580 & NP-ING-OC & 0.130580 \\
    Genetics & NP & 0.305055 & NP-ING & 0.114337 & NP-ING-OC & 0.114337 \\
    Biotechnology & NP & 0.337702 & NP-ING & 0.107471 & NP-ING-OC & 0.107471 \\
    Cell Biology & NP & 0.297386 & NP-ING & 0.153232 & NP-ING-OC & 0.153232 \\
    Public Health & NP & 0.338684 & NP-PRED-RS & 0.097372 & NP-TO-INF-VC & 0.081143 \\
    Endocrinology & NP & 0.352185 & NP-ING & 0.141674 & NP-ING-OC & 0.141674 \\

    \hline
  \end{tabular}
  \caption{Top three SCFs, by subdomain, for \emph{perform}.}
  \label{perform:table}
\end{figure*}
\clearpage

\newpage
\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/predict_heatmap.png}
  \caption{Heat map of Jensen--Shannon divergence between subdomains
    for the SCF distributions of \emph{predict}.}
  \label{predict:hm}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/predict_dendrogram.png}
  \caption{Hierarchical clustering of subdomains via average-linking
    for the SCF distributions of \emph{predict}.}
  \label{predict:dend}
\end{figure*}

\newpage

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/predict_pca.png}
  \caption{Two-dimensional PCA reduction with Gap-statistic-optimal
    clustering for the SCF distributions of \emph{predict}.}
  \label{predict:pca}
\end{figure*}

\begin{figure*}
  \centering \scriptsize
  \begin{tabular}{| l | l l | l l | l l |}
    \hline
    Subdomain & \multicolumn{6}{c|}{Top three SCFs} \\
    \hline
    Vascular Diseases & NP-PP-PRED & 0.319039 & NP & 0.259005 & NP-PRED-RS & 0.197256 \\
    Psychiatry & NP & 0.296053 & NP-PP-PRED & 0.265351 & NP-PRED-RS & 0.155702 \\
    Public Health & NP & 0.313056 & NP-PP-PRED & 0.258160 & NP-PRED-RS & 0.143917 \\
    Medicine & NP & 0.333758 & NP-PP-PRED & 0.249682 & NP-PRED-RS & 0.152866 \\
    Communicable Diseases & NP-PP-PRED & 0.272923 & NP & 0.242837 & NP-PRED-RS & 0.139685 \\
    Physiology & NP & 0.297170 & NP-PP-PRED & 0.266509 & NP-PRED-RS & 0.127358 \\
    Neoplasms & NP-PP-PRED & 0.301850 & NP & 0.252678 & NP-PP & 0.176241 \\
    Critical Care & NP & 0.321659 & NP-PP-PRED & 0.291244 & NP-PRED-RS & 0.185253 \\
    Pulmonary Medicine & NP & 0.610138 & NP-PP-PRED & 0.117051 & NP-PRED-RS & 0.073733 \\
    Rheumatology & NP & 0.287570 & NP-PP-PRED & 0.257885 & NP-PRED-RS & 0.150278 \\
    Environmental Health & NP & 0.356804 & NP-PP-PRED & 0.259309 & NP-PRED-RS & 0.119838 \\
    Neurology & NP & 0.239140 & NP-PP-PRED & 0.174610 & HAT-S & 0.115141 \\
    Biotechnology & NP & 0.304348 & NP-PP-PRED & 0.214393 & NP-TOBE & 0.143928 \\
    Virology & NP & 0.176289 & NP-TOBE & 0.139175 & NP-PP-PRED & 0.126804 \\
    Biochemistry & NP-PP-PRED & 0.190345 & NP & 0.167586 & NP-TOBE & 0.124138 \\
    Tropical Medicine & NP & 0.261468 & NP-PP-PRED & 0.133486 & NP-PRED-RS & 0.104587 \\
    Molecular Biology & NP & 0.212812 & NP-PP-PRED & 0.185082 & NP-TOBE & 0.105761 \\
    Microbiology & NP & 0.211287 & NP-TOBE & 0.165237 & NP-TO-INF-OC & 0.125508 \\
    Botany & NP & 0.265457 & NP-TOBE & 0.139535 & NP-TO-INF-OC & 0.119682 \\
    Genetics & NP & 0.258138 & NP-PP-PRED & 0.137358 & NP-TOBE & 0.103301 \\
    Genetics, Medical & NP & 0.277823 & NP-PP-PRED & 0.187652 & NP-TO-INF-OC & 0.130788 \\

    \hline
  \end{tabular}
  \caption{Top three SCFs, by subdomain, for \emph{predict}.}
  \label{predict:table}
\end{figure*}
\clearpage

\newpage
\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/recognize_heatmap.png}
  \caption{Heat map of Jensen--Shannon divergence between subdomains
    for the SCF distributions of \emph{recognize}.}
  \label{recognize:hm}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/recognize_dendrogram.png}
  \caption{Hierarchical clustering of subdomains via average-linking
    for the SCF distributions of \emph{recognize}.}
  \label{recognize:dend}
\end{figure*}

\newpage

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/recognize_pca.png}
  \caption{Two-dimensional PCA reduction with Gap-statistic-optimal
    clustering for the SCF distributions of \emph{recognize}.}
  \label{recognize:pca}
\end{figure*}

\begin{figure*}
  \centering \scriptsize
  \begin{tabular}{| l | l l | l l | l l |}
    \hline
    Subdomain & \multicolumn{6}{c|}{Top three SCFs} \\
    \hline
    Public Health & NP & 0.257610 & NP-PP-PRED & 0.125464 & NP-AS-NP & 0.096511 \\
    Environmental Health & NP & 0.302128 & HAT-S & 0.093617 & NP-PP-PRED & 0.093617 \\
    Medicine & NP & 0.413386 & NP-PP-PRED & 0.118110 & NP-PRED-RS & 0.100394 \\
    Medical Informatics & NP & 0.332331 & NP-PP-PRED & 0.169925 & IT-PASS-SFIN & 0.075188 \\
    Tropical Medicine & NP & 0.423986 & NP-S & 0.108108 & IT-PASS-SFIN & 0.104730 \\
    Vascular Diseases & NP & 0.251641 & IT-PASS-SFIN & 0.157549 & NP-AS-NP-SC & 0.135667 \\
    Pulmonary Medicine & NP & 0.362429 & IT-PASS-SFIN & 0.132827 & NP-S & 0.121442 \\
    Neoplasms & NP & 0.447775 & NP-PP-PRED & 0.117166 & NP-AS-NP-SC & 0.101726 \\
    Neurology & NP & 0.396584 & NP-PP-PRED & 0.146110 & NP-PRED-RS & 0.104364 \\
    Rheumatology & NP & 0.505841 & NP-PP-PRED & 0.156542 & NP-PRED-RS & 0.096963 \\
    Genetics & NP & 0.491974 & NP-PP-PRED & 0.130016 & NP-PRED-RS & 0.108347 \\
    Microbiology & NP & 0.505447 & NP-PP-PRED & 0.159041 & NP-PRED-RS & 0.100218 \\
    Virology & NP & 0.525084 & NP-PP-PRED & 0.158863 & NP-PRED-RS & 0.107023 \\
    Science & NP & 0.530660 & NP-PP-PRED & 0.136792 & NP-PRED-RS & 0.106132 \\
    Communicable Diseases & NP & 0.463087 & NP-AS-NP-SC & 0.194631 & NP-AS-NP & 0.194631 \\
    Biochemistry & NP & 0.465596 & NP-PP-PRED & 0.135321 & NP-PRED-RS & 0.080275 \\

    \hline
  \end{tabular}
  \caption{Top three SCFs, by subdomain, for \emph{recognize}.}
  \label{recognize:table}
\end{figure*}
\clearpage

\newpage
\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/treat_heatmap.png}
  \caption{Heat map of Jensen--Shannon divergence between subdomains
    for the SCF distributions of \emph{treat}.}
  \label{treat:hm}
\end{figure*}

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/treat_dendrogram.png}
  \caption{Hierarchical clustering of subdomains via average-linking
    for the SCF distributions of \emph{treat}.}
  \label{treat:dend}
\end{figure*}

\newpage

\begin{figure*}
  \centering
  \includegraphics[height=.4\textheight]{figures/treat_pca.png}
  \caption{Two-dimensional PCA reduction with Gap-statistic-optimal
    clustering for the SCF distributions of \emph{treat}.}
  \label{treat:pca}
\end{figure*}

\begin{figure*}
  \centering \scriptsize
  \begin{tabular}{| l | l l | l l | l l |}
    \hline
    Subdomain & \multicolumn{6}{c|}{Top three SCFs} \\
    \hline
    Pulmonary Medicine & NP & 0.337748 & NP-PP-PP PFORM & 0.167770 & NP-NP-PRED & 0.129139 \\
    Pharmacology & NP & 0.274845 & NP-NP-PRED & 0.184783 & NP-NP & 0.184783 \\
    Veterinary Medicine & NP & 0.360000 & NP-FOR-NP & 0.120000 & NP-PP-PP PFORM & 0.106667 \\
    Vascular Diseases & NP & 0.388060 & PP & 0.099502 & PP-PRED-RS & 0.099502 \\
    Tropical Medicine & NP & 0.425547 & NP-NP-PRED & 0.103035 & NP-NP & 0.103035 \\
    Medicine & NP & 0.355288 & NP-PP-PP PFORM & 0.126160 & NP-PRED-RS & 0.080705 \\
    Communicable Diseases & NP & 0.353806 & NP-PP-PP PFORM & 0.173010 & NP-FOR-NP & 0.121107 \\
    Neoplasms & NP & 0.314900 & NP-PP-PP PFORM & 0.219662 & PP-PRED-RS & 0.094470 \\
    Biochemistry & NP & 0.252427 & NP-PP-PP PFORM & 0.200647 & PP-PRED-RS & 0.101942 \\
    Endocrinology & NP & 0.240283 & NP-PP-PP PFORM & 0.207303 & PP-PP & 0.089517 \\
    Rheumatology & NP & 0.283192 & NP-PP-PP PFORM & 0.203390 & PP & 0.133475 \\
    Science & NP-PP-PP PFORM & 0.224299 & NP & 0.190314 & PP-PP & 0.115548 \\
    Neurology & NP & 0.260030 & NP-PP-PP PFORM & 0.228826 & NP-NP-PRED & 0.123328 \\
    Virology & NP-PP-PP PFORM & 0.300000 & NP & 0.209524 & NP-NP-PRED & 0.102381 \\
    Microbiology & NP-PP-PP PFORM & 0.322925 & NP & 0.201828 & PP-PRED-RS & 0.105864 \\
    Cell Biology & NP-PP-PP PFORM & 0.389027 & NP & 0.182045 & PP-PP & 0.114713 \\
    Botany & NP & 0.214421 & NP-PP-PP PFORM & 0.204934 & NP-NP & 0.100569 \\
    Physiology & NP & 0.358191 & NP-PP-PP PFORM & 0.107579 & NP-NP-PRED & 0.074572 \\
    Environmental Health & NP & 0.385877 & NP-PP-PP PFORM & 0.091298 & NP-AS-NP-SC & 0.077746 \\
    Genetics & NP & 0.211664 & NP-PP-PP PFORM & 0.189040 & NP-AS-NP-SC & 0.096531 \\
    Molecular Biology & NP-PP-PP PFORM & 0.281690 & NP & 0.170775 & PP-PP & 0.070423 \\
    Geriatrics & NP & 0.346975 & NP-PRED-RS & 0.097865 & NP-PP-PRED & 0.088968 \\
    Critical Care & NP & 0.413424 & NP-PP-PP PFORM & 0.108949 & PP-PRED-RS & 0.090467 \\
    Gastroenterology & NP & 0.546099 & NP-PP-PP PFORM & 0.148936 & PP-PRED-RS & 0.083333 \\
    Public Health & NP & 0.342735 & NP-FOR-NP & 0.124786 & NP-AS-NP-SC & 0.101709 \\

    \hline
  \end{tabular}
  \caption{Top three SCFs, by subdomain, for \emph{treat}.}
  \label{treat:table}
\end{figure*}
\clearpage
%\end{comment}





% \section{Theory/Calculations}

%
%% The Appendices part is started with the command \appendix; appendix
%% sections are then done as normal sections
%% \appendix

%% \section{}
%% \label{}

% \section{Appendices}



%% References
%%
%% Following citation commands can be used in the body text: Usage of
%% \cite is as follows: \cite{key} ==>> [#] \cite[chap. 2]{key} ==>>
%% [#, chap. 2] \citet{key} ==>> Author [#]

%% References with bibTeX database:

% \bibliographystyle{model3-num-names}
\bibliographystyle{plain} \bibliography{jbmi_bib}

%% Authors are advised to submit their bibtex database files. They are
%% requested to list a bibtex style file in the manuscript if they do
%% not want to use model3-num-names.bst.

%% References without bibTeX database:

% \begin{thebibliography}{00}

%% \bibitem must have the following form:
%% \bibitem{key}...
%%

% \bibitem{}

% \end{thebibliography}


\end{document}

%%
%% End of file `elsarticle-template-3-num.tex'.

